##
## 
## This script constructs the vendor distance outcome at the hospital -year level
## 
##



library(tidyverse)
#library(reshape2)
library(readxl)
library(data.table)
library(foreign)
library(haven)

# directories 
fpath_himss = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/himss/output/"


# read in the data and clean up 
read <- function(f_data = f_data, y) {
  
  d = read_dta(f_data)
  d = data.table(d)

  d = d[year == y, ]
  
  keep = d[, colSums(.SD), .SDcols = names(d)[str_detect(names(d), "v_")]]
  keep = keep[keep > 0]
  d = d[, .SD, 
        .SDcols = c("target", "acq_legacy", "target2", 
                    "acq_other", "acqhosp",
                           "id", "year", names(keep))]
  

  return(d)
}


# for a set of observations between hospitals, compute the pairwise
# correlations
avg_dist <- function(ref, other) {
  
  output = ref[, 'id']
  
  keep = names(ref)[str_detect(names(ref), "v_")]
  
  nhosp_ref = dim(ref)[1]
  nhosp_other = dim(other)[1]
  
  ## for each hospital in the reference system,
  ## calculate distance from each other hospital in the comparison system
  for(i in 1:nhosp_ref) {
    
      x = ref[i, .SD, .SDcols = keep]
      
      ref_dist <- c()
      
      for(j in 1:nhosp_other) {
        
        y = other[j, .SD, .SDcols = keep]
        
        ref_dist <- c(ref_dist, sqrt(sum(abs(colSums(rbind(x, -1*y))))))
      }

    output = output[i, dist := mean(ref_dist)]
    }  

  return(output)
}


# get observations for each hospital group in year y
gen_groups <- function(f_data = f_data, y) {
  
  d = read(f_data, y)
  
  # select hosp groups 
  d_leg = d[acq_legacy == 1, ]
  
  d_target = d[target == 1, ]
  
  d_target2 = d[target2 == 1, ]
  
  d_acqother = d[acq_other == 1, ]
  
  d_othernp = d[acqhosp == 0, ]
  
  out = list("legacy" = d_leg,
             "target" = d_target,
             "target2" = d_target2,
             "acqother" = d_acqother,
             "othernp" = d_othernp)
  
  return(out)
}


# compute the vendor distance matrix for one year 
get_dist_matrix <- function(f_data = f_data, y) {
  
  groups = gen_groups(f_data, y)

  names = names(groups)
  
  output_final = c()
  
  for(h in names){
   
    #h = "legacy"
    left = names 
    
    ref = groups[[`h`]]
    #ref = groups[["legacy"]]
    
    output = ref[, c("id", "year", "acq_legacy", "target", "target2", "acq_other", "acqhosp")]
    
    print(paste("Reference group for ", y, ": ", h, sep = ""))
    
    for(o in left) {
      
      # get the comparison group
      other = groups[[`o`]]
      
      print(paste("                Other group:", o))
      
      # compute distance 
      dist = avg_dist(ref, other)
      setnames(dist, old = "dist", new = paste("dist", o, sep = ""))
      
      # store
      output = merge(output, dist, by = "id")
      
      #output = output[, paste("dist", o, sep = "") := dist] 
      #output = output[, c('dist') := NULL]
      
      # reindex to the next remaining comparison group
      index = which(left == o)
      left = left[index+1:length(left)]
    }
    
  output_final = rbind(output_final, output)
  }
  
  return(output_final)
}

   
# join the distance matrices for all years between start and end 
# write to file
implement_all_years <- function(f_data = f_data, start, end) {
  
  output = c()
  
  for(y in seq(start, end)) {
    
    # compute distance matrix for y
    temp = get_dist_matrix(f_data, y)
    
    # attach 
    output = rbind(output, temp)
    
  }
  return(output)

}


##############################################################################################
# implement
##############################################################################################



if(1==0){
  start_time = Sys.time()
  
  
  
  ## specify the vendor indicator file (constructed in buld_himss)
  f_data = paste(fpath_himss, 
                 "vendor_distance_v3_20230606.dta",
                 sep = "") 
  
  ## call the above functions and write the annual files 
  ## merge the annual vendor distance file into a single hospital-year panel
  start = 2003
  end = 2014
  output = implement_all_years(f_data, start, end)
  
  #write to CSV and dta formats 
  fwrite(output, paste(fpath_himss, "vendor_distances_indiv", 
                       start, "_", end, "_20230606.csv", sep = ""))
  
  write_dta(data.frame(output),
            paste(fpath_himss, "vendor_distances_indiv", 
                  start, "_", end, "_20230606.dta", sep = ""))
  
  print("Writing vendor distance measures:")
  print(Sys.time() - start_time)
  }






  



  
  
  


